Yesterday we introduced VGG, the classic CNN model that followed LeNet-5 and AlexNet: by replacing large convolution kernels with stacks of small ones, the network can be made deeper and wider while also trimming the parameter count. Today we take the most commonly used variant, VGG16, and try it out on CIFAR10 with PyTorch!
import torch
import torchvision.models as models
# Load a pre-trained VGG model (VGG16 in this case)
pretrained_vgg = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1) # with pre-trained weights
# pretrained_vgg = models.vgg16(weights=None) # without pre-trained weights
# (on torchvision versions before 0.13, use models.vgg16(pretrained=True) instead)
# Print the pre-trained VGG model architecture
print(pretrained_vgg)
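As a side note, if you plan to fine-tune the pre-trained model on CIFAR10 directly, a common trick is to swap the 1000-class ImageNet head for a 10-class one. A minimal sketch (in torchvision's VGG16, classifier[6] is the final Linear layer):
import torch.nn as nn
# Replace the final nn.Linear(4096, 1000) with a 10-class head for CIFAR10.
pretrained_vgg.classifier[6] = nn.Linear(4096, 10)
# Optionally freeze the convolutional backbone so only the new head is trained.
for param in pretrained_vgg.features.parameters():
    param.requires_grad = False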
nn.Sequential() wraps several layers or operations into a single group, so that they can be invoked together rather than one at a time; the printed model architecture also comes out much tidier.
import torch
import torch.nn as nn
# Define the VGG model architecture
class VGG(nn.Module):
def __init__(self, num_classes=1000):
super(VGG, self).__init__()
self.features = nn.Sequential(
nn.Conv2d(3, 64, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(64, 64, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
nn.Conv2d(64, 128, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(128, 128, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
nn.Conv2d(128, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
nn.Conv2d(256, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
)
self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
self.classifier = nn.Sequential(
nn.Linear(512 * 7 * 7, 4096),
nn.ReLU(inplace=True),
nn.Dropout(),
nn.Linear(4096, 4096),
nn.ReLU(inplace=True),
nn.Dropout(),
nn.Linear(4096, num_classes),
)
def forward(self, x):
x = self.features(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.classifier(x)
return x
# Create a VGG model instance
vgg_model = VGG(num_classes=1000)
# Print the VGG model architecture
print(vgg_model)
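To sanity-check that the hand-written architecture behaves as expected, we can push a dummy batch through it and look at the output shape; a quick check assuming a 224x224 RGB input:
# One fake RGB image at VGG's expected 224x224 resolution.
dummy = torch.randn(1, 3, 224, 224)
print(vgg_model(dummy).shape)  # expected: torch.Size([1, 1000])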
If we write the same model without nn.Sequential(), it looks like this:
import torch
import torch.nn as nn
# Define the VGG model architecture
class VGG(nn.Module):
def __init__(self, num_classes=1000):
super(VGG, self).__init__()
self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
self.relu1_1 = nn.ReLU(inplace=True)
self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
self.relu1_2 = nn.ReLU(inplace=True)
self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
self.relu2_1 = nn.ReLU(inplace=True)
self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
self.relu2_2 = nn.ReLU(inplace=True)
self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
self.relu3_1 = nn.ReLU(inplace=True)
self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
self.relu3_2 = nn.ReLU(inplace=True)
self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
self.relu3_3 = nn.ReLU(inplace=True)
self.maxpool3 = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
self.relu4_1 = nn.ReLU(inplace=True)
self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.relu4_2 = nn.ReLU(inplace=True)
self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.relu4_3 = nn.ReLU(inplace=True)
self.maxpool4 = nn.MaxPool2d(kernel_size=2, stride=2)
self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.relu5_1 = nn.ReLU(inplace=True)
self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.relu5_2 = nn.ReLU(inplace=True)
self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
self.relu5_3 = nn.ReLU(inplace=True)
self.maxpool5 = nn.MaxPool2d(kernel_size=2, stride=2)
self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
self.classifier1 = nn.Linear(512 * 7 * 7, 4096)
self.classifier2 = nn.ReLU(inplace=True)
self.classifier3 = nn.Dropout()
self.classifier4 = nn.Linear(4096, 4096)
self.classifier5 = nn.ReLU(inplace=True)
self.classifier6 = nn.Dropout()
self.classifier7 = nn.Linear(4096, num_classes)
def forward(self, x):
x = self.relu1_1(self.conv1_1(x))
x = self.relu1_2(self.conv1_2(x))
x = self.maxpool1(x)
x = self.relu2_1(self.conv2_1(x))
x = self.relu2_2(self.conv2_2(x))
x = self.maxpool2(x)
x = self.relu3_1(self.conv3_1(x))
x = self.relu3_2(self.conv3_2(x))
x = self.relu3_3(self.conv3_3(x))
x = self.maxpool3(x)
x = self.relu4_1(self.conv4_1(x))
x = self.relu4_2(self.conv4_2(x))
x = self.relu4_3(self.conv4_3(x))
x = self.maxpool4(x)
x = self.relu5_1(self.conv5_1(x))
x = self.relu5_2(self.conv5_2(x))
x = self.relu5_3(self.conv5_3(x))
x = self.maxpool5(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.classifier1(x)
x = self.classifier2(x)
x = self.classifier3(x)
x = self.classifier4(x)
x = self.classifier5(x)
x = self.classifier6(x)
x = self.classifier7(x)
return x
# Create a VGG model instance
vgg_model = VGG(num_classes=1000)
# Print the VGG model architecture
print(vgg_model)
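A practical difference between the two styles is how you reach an individual layer afterwards, for example to inspect its weights: with nn.Sequential() you index into the container, while with named attributes you use the name directly. A short illustration using the two definitions above:
# Named-attribute style: each layer is reachable by its own name.
print(vgg_model.conv1_1.weight.shape)  # torch.Size([64, 3, 3, 3])
# Sequential style: the same layer would be vgg_model.features[0] instead.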
Since nn.Sequential() is typically used to wrap small groups of layers, we can also go one step further and wrap larger chunks of the architecture in separate classes. This style is usually adopted when the model architecture is more complex:
import torch
import torch.nn as nn
# Define the FeatureExtractor class
class FeatureExtractor(nn.Module):
def __init__(self):
super(FeatureExtractor, self).__init__()
self.features = nn.Sequential(
nn.Conv2d(3, 64, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(64, 64, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
nn.Conv2d(64, 128, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(128, 128, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
nn.Conv2d(128, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
nn.Conv2d(256, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(512, 512, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2),
)
self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
def forward(self, x):
x = self.features(x)
x = self.avgpool(x)
return x
# Define the Classifier class
class Classifier(nn.Module):
def __init__(self, num_classes=1000):
super(Classifier, self).__init__()
self.classifier = nn.Sequential(
nn.Linear(512 * 7 * 7, 4096),
nn.ReLU(inplace=True),
nn.Dropout(),
nn.Linear(4096, 4096),
nn.ReLU(inplace=True),
nn.Dropout(),
nn.Linear(4096, num_classes),
)
def forward(self, x):
x = x.view(x.size(0), -1)
x = self.classifier(x)
return x
# Combine the FeatureExtractor and Classifier to create the VGG model
class VGG(nn.Module):
def __init__(self, num_classes=1000):
super(VGG, self).__init__()
self.features = FeatureExtractor()
self.classifier = Classifier(num_classes=num_classes)
def forward(self, x):
x = self.features(x)
x = self.classifier(x)
return x
# Create a VGG model instance
vgg_model = VGG(num_classes=1000)
print(vgg_model)
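One benefit of splitting the model this way is that the backbone can be reused on its own, for example to pull out feature maps for another task. A small sketch:
# Use only the FeatureExtractor to obtain the pooled feature maps.
with torch.no_grad():
    feats = vgg_model.features(torch.randn(1, 3, 224, 224))
print(feats.shape)  # expected: torch.Size([1, 512, 7, 7])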
Above are three ways of writing the same model: wrapping small groups of layers with nn.Sequential(), defining every layer one by one, and wrapping large chunks of the architecture in extra classes. Try running the code above and compare how the three differ in their printed output. Next, let's train the model on CIFAR10; since CIFAR10 has 10 classes, we create the model with model = VGG(num_classes=10).
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
# Hyperparameters
batch_size = 64
learning_rate = 0.001
num_epochs = 10
# Data preprocessing and loading
transform = transforms.Compose([transforms.Resize((224,224)),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, transform=transform, download=True)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, transform=transform)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
# Initialize the VGG model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VGG(num_classes=10).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Training loop
total_step = len(train_loader)
for epoch in range(num_epochs):
model.train()
for i, (images, labels) in enumerate(train_loader):
outputs = model(images.to(device))
loss = criterion(outputs, labels.to(device))
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (i + 1) % 100 == 0:
print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{total_step}], Loss: {loss.item():.4f}')
# Evaluation
model.eval()
with torch.no_grad():
correct = 0
total = 0
for images, labels in test_loader:
outputs = model(images.to(device))
_, predicted = torch.max(outputs, 1)
total += labels.size(0)
correct += (predicted == labels.to(device)).sum().item()
print(f'Test Accuracy: {100 * correct / total:.2f}%')
Note that the input images here are resized to 224x224 and the batch size (how many samples are fed to the model at once) is 64, which may exceed the memory limit of an ordinary computer. You can work around this by lowering the batch size. (Shrinking the input image size is less recommended, because some layers in the model would then need to be adjusted as well, which is harder if you are not yet familiar with the architecture.)
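To get a feel for why the memory footprint is so large, we can count the parameters; the first fully connected layer alone (512*7*7 inputs times 4096 outputs) accounts for over 100 million weights. A quick check:
# Count trainable parameters; VGG16 with a 10-class head has roughly 134 million.
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Trainable parameters: {num_params:,}')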